# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Any, List, Mapping, Optional, Text

from language.canine.tydiqa import char_splitter
from language.canine.tydiqa import data
from language.canine.tydiqa import tf_io
from language.canine.tydiqa import tydi_tokenization_interface
import tensorflow.compat.v1 as tf


# Module-level aliases for TensorFlow's flags module and its global FLAGS
# object. Per the original note these exist for `test_srcdir`; no read of FLAGS
# is visible in this part of the file -- TODO confirm where it is used.
flags = tf.flags
FLAGS = flags.FLAGS


def make_tokenizer() -> tydi_tokenization_interface.TokenizerWithOffsets:
  """Builds the tokenizer handed to `CreateTFExampleFn` by the tests below.

  Returns:
    A newly constructed `char_splitter.CharacterSplitter`.
  """
  tokenizer = char_splitter.CharacterSplitter()
  return tokenizer


def _print_debug_info(results: List[tf.train.Example],
                      creator_fn: tf_io.CreateTFExampleFn):
  """Prints the int64 features of each example for manual debugging.

  For every example a `RESULT[i]` header is printed, followed by one
  tab-separated `name<TAB>values` line per feature. Only the `int64_list`
  values of a feature are read.

  Args:
    results: The examples to dump.
    creator_fn: Supplies `tokenizer.id_to_string`, used to render the
      `input_ids` feature as pieces instead of raw integer IDs.
  """
  for index, example in enumerate(results):
    print(f'\nRESULT[{index}]')
    for feature_name, feature in example.features.feature.items():
      int_values = feature.int64_list.value
      if feature_name == 'input_ids':
        # Show human-readable pieces; the degree sign is swapped for '@' in
        # the printed output.
        printable = []
        for token_id in int_values:
          piece = creator_fn.tokenizer.id_to_string(token_id)
          printable.append(piece.replace('°', '@'))
      else:
        printable = list(int_values)
      print(f'{feature_name}\t{printable}')


# Test fixture shaped like the output of `preproc.create_entry_from_json`, for
# a question whose answer is a minimal span ('throat and upper breast').
#
# The 'contexts' string contains passage markers (the private-use characters
# '\ue006', '\ue007', ...), which are generated by
# `TyDiTokenizer.get_passage_marker` and inserted by
# `preproc.create_entry_from_json`.
_ENTRY_MIN_ANSWER = {
    'id': '111',
    'language': 'english',
    'name': 'Zebra finch',
    'question': {
        'input_text': "Where are a zebra finch's stripes located?"
    },
    # The raw article text: one passage per line, without passage markers.
    'plaintext':
        'The zebra finch is the most common estrildid finch. The bird '
        'has been introduced to Puerto Rico.\n'
        'The body temperature (as measured from the cloaca) of the zebra '
        'finch may vary from 38 to 44 °C.\n'
        'The zebra finch was first collected in 1801 during Nicolas '
        "Baudin's expedition to Australia. It was described in 1817 by "
        'Louis Jean Pierre Vieillot in his Nouveau Dictionnaire '
        "d'Histoire Naturelle.\n"
        'Morphological differences between the subspecies. Males do not '
        'have the fine barring found on the throat and upper breast.\n'
        'Symmetry of both plumage, like chest bands, and artificial '
        'features, like leg bands, are preferred by the female.\n'
        'Nest predators of the zebra finch include the tiger snake.',
    # The same passages joined by spaces, each preceded by a passage marker.
    'contexts':
        '\ue006 The zebra finch is the most common estrildid '
        'finch. The bird has been introduced to Puerto Rico. '
        '\ue007 The body temperature (as measured from the cloaca) '
        'of the zebra finch may vary from 38 to 44 °C. \ue008 The '
        "zebra finch was first collected in 1801 during Nicolas Baudin's "
        'expedition to Australia. It was described in 1817 by Louis Jean '
        "Pierre Vieillot in his Nouveau Dictionnaire d'Histoire "
        'Naturelle. \ue009 Morphological differences between the '
        'subspecies. Males do not have the fine barring found on the '
        'throat and upper breast. \ue010 Symmetry of both plumage, '
        'like chest bands, and artificial features, like leg bands, are '
        'preferred by the female. \ue011 Nest predators of the '
        'zebra finch include the tiger snake.',
    # In this fixture there is one entry per UTF-8 byte of 'contexts' (each
    # marker is 3 bytes), giving the matching byte offset in 'plaintext'. -1
    # marks bytes with no plaintext counterpart: the markers and the spaces
    # around them.
    'context_to_plaintext_offset': [
        -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
        70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
        88, 89, 90, 91, 92, 93, 94, 95, -1, -1, -1, -1, -1, 97, 98, 99, 100,
        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
        171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
        185, 186, 187, 188, 189, 190, 191, 192, 193, -1, -1, -1, -1, -1, 195,
        196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
        210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
        238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
        252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265,
        266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
        280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293,
        294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307,
        308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321,
        322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335,
        336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
        350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
        364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
        378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
        -1, -1, -1, -1, -1, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
        403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416,
        417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430,
        431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444,
        445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458,
        459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472,
        473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486,
        487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500,
        501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514,
        -1, -1, -1, -1, -1, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525,
        526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
        540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553,
        554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567,
        568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581,
        582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595,
        596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609,
        610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
        624, 625, 626, 627, 628, -1, -1, -1, -1, -1, 630, 631, 632, 633, 634,
        635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648,
        649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
        663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676,
        677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687
    ],
    'answer': {
        'candidate_id': 3,  # Passage.
        'input_text': 'minimal',
        # Both span bounds are UTF-8 byte offsets (markers count as 3 bytes).
        'span_start': 507,  # Start of answer span in `contexts`.
        'span_end': 530,  # Limit of answer span in `contexts`.
        'span_text': 'throat and upper breast'
    },
    'has_correct_context': True,
}

# Test fixture shaped like the output of `preproc.create_entry_from_json`, for
# a question whose answer is a whole passage (candidate 1) rather than a
# minimal span.
#
# The 'contexts' string contains passage markers (the private-use characters
# '\ue006', '\ue007', ...), which are generated by
# `TyDiTokenizer.get_passage_marker` and inserted by
# `preproc.create_entry_from_json`.
_ENTRY_PASSAGE_ANSWER = {
    'id': '200',
    'language': 'english',
    'name': 'Zebra finch',
    'question': {
        'input_text': 'Something without a minimal answer?'
    },
    # The raw article text: one passage per line, without passage markers.
    'plaintext': 'The zebra finch is the most common estrildid finch.\n'
                 'The body temperature may vary from 38 to 44 °C.\n'
                 'Nest predators include the tiger snake.',
    # The same passages joined by spaces, each preceded by a passage marker.
    'contexts': '\ue006 The zebra finch is the most common estrildid finch. '
                '\ue007 The body temperature may vary from 38 to 44 °C. '
                '\ue008 Nest predators include the tiger snake.',
    # In this fixture there is one entry per UTF-8 byte of 'contexts' (each
    # marker is 3 bytes), giving the matching byte offset in 'plaintext'. -1
    # marks bytes with no plaintext counterpart: the markers and the spaces
    # around them.
    'context_to_plaintext_offset': [
        -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, -1,
        -1, -1, -1, -1, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, -1, -1,
        -1, -1, -1, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
        113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
        127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139
    ],
    'answer': {
        'candidate_id': 1,
        'input_text': 'passage',
        # UTF-8 byte offsets into 'contexts': [60, 108) covers the second
        # passage ('°' is 2 bytes, so the 47-character passage is 48 bytes).
        'span_start': 60,
        'span_end': 108,
        'span_text': 'The body temperature may vary from 38 to 44 °C.'
    },
    'has_correct_context': True,
}

# Test fixture shaped like the output of `preproc.create_entry_from_json`, for
# a question with no answer: every span field is -1 or empty and
# 'has_correct_context' is False.
#
# The 'contexts' string contains passage markers (the private-use characters
# '\ue006', '\ue007'), which are generated by
# `TyDiTokenizer.get_passage_marker` and inserted by
# `preproc.create_entry_from_json`.
#
# NOTE(review): this entry reuses id '200' from _ENTRY_PASSAGE_ANSWER --
# confirm that the shared id is intentional.
_ENTRY_NO_ANSWER = {
    'id': '200',
    'language': 'english',
    'name': 'Zebra finch',
    'question': {
        'input_text': 'Something without a minimal answer?'
    },
    # The raw article text: one passage per line, without passage markers.
    'plaintext': 'The zebra finch is the most common estrildid finch.\n'
                 'The body temperature may vary from 38 to 44 °C.',
    # The same passages joined by spaces, each preceded by a passage marker.
    'contexts': '\ue006 The zebra finch is the most common estrildid finch. '
                '\ue007 The body temperature may vary from 38 to 44 °C.',
    # In this fixture there is one entry per UTF-8 byte of 'contexts' (each
    # marker is 3 bytes), giving the matching byte offset in 'plaintext'. -1
    # marks bytes with no plaintext counterpart: the markers and the spaces
    # around them.
    'context_to_plaintext_offset': [
        -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, -1,
        -1, -1, -1, -1, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99
    ],
    # All answer fields hold "no answer" placeholder values.
    'answer': {
        'candidate_id': -1,
        'input_text': 'passage',
        'span_start': -1,
        'span_end': -1,
        'span_text': ''
    },
    'has_correct_context': False,
}


class TfIoTest(tf.test.TestCase):

  def assertCreateTfexampleFnResult(
      self,
      entry: Mapping[Text, Any],
      creator_fn: tf_io.CreateTFExampleFn,
      result: tf.train.Example,
      result_index: int,
      expected_language: data.Language,
      expected_input_ids_length: int,
      expected_question_ids_length: int,
      expected_content_pieces: List[Text],
      expected_answer_type: Optional[data.AnswerType] = None,
      expected_answer_span_start: Optional[int] = None,
      expected_answer_span_end: Optional[int] = None,
      expected_answer_span_pieces: Optional[List[Text]] = None,
  ):
    """Checks the results of running `CreateTFExampleFn.process`.

    The checks common to both modes cover the feature count, the example and
    unique indices, the language ID, and the `input_ids`, `segment_ids` and
    `input_mask` vectors. In training mode the answer type and answer span are
    checked as well; otherwise the `wp_start_offset` / `wp_end_offset`
    piece-to-plaintext-byte mapping is validated against `entry['plaintext']`.

    Args:
      entry: The output of `preproc.create_entry_from_json`.
      creator_fn: The object being tested, to retrieve the tokenizer object and
        is_training setting.
      result: An output from `CreateTFExampleFn.process`.
      result_index: The index of `result` within the list returned by
        `CreateTFExampleFn.process`.
      expected_language: The expected language in the result object.
      expected_input_ids_length: The expected size of the `input_ids` vector.
      expected_question_ids_length: The expected size of the "question" portion
        of the `input_ids` vector.
      expected_content_pieces: The contents of the `input_ids` vector, up to the
        start of the padding, represented by the human-readable strings that
        would be returned by TyDiTokenizer.id_to_string (though the underlying
        code will actually be returning integer IDs).
      expected_answer_type: Required iff is_training==True.
      expected_answer_span_start: Required iff is_training==True.
      expected_answer_span_end: Required iff is_training==True.
      expected_answer_span_pieces: Required iff is_training==True.
    """

    def feature_vals(key: Text) -> List[int]:
      return result.features.feature[key].int64_list.value

    # A piece without a plaintext mapping must be EXACTLY '[SEP]', a single
    # private-use character (passage marker), or a single space. Both
    # alternatives sit inside one group so that `^` and `$` anchor each of
    # them; without the group, `^` would bind only to the first alternative
    # and `$` only to the second.
    unmapped_piece_regex = '^(?:\\[SEP\\]|[\ue000-\uf8ff ])$'
    marker_regex = '^[\ue000-\uf8ff]$'

    if creator_fn.is_training:
      self.assertLen(result.features.feature, 9)
    else:
      self.assertLen(result.features.feature, 8)

    # Assert that `result` has the expected indices.
    entry_id = int(entry['id'])
    self.assertEqual(feature_vals('example_index'), [entry_id])
    self.assertEqual(feature_vals('unique_ids'), [entry_id + result_index])

    # Assert that `language_id` has the expected content.
    self.assertEqual(feature_vals('language_id'), [int(expected_language)])

    # Assert that `input_ids` is the expected length (i.e. `max_seq_length`).
    self.assertLen(feature_vals('input_ids'), expected_input_ids_length)

    # Compute the expected number of pieces that contain actual content, and
    # the expected number of padding pieces.
    content_len = len(expected_content_pieces)
    pad_len = expected_input_ids_length - content_len
    padding_pieces = ['[PAD]'] * pad_len

    # Assert that `input_ids` has the right content by converting `result`'s IDs
    # to strings and appending padding pieces to `expected_content_pieces`.
    input_pieces = [
        creator_fn.tokenizer.id_to_string(i) for i in feature_vals('input_ids')
    ]
    self.assertAllEqual(input_pieces, expected_content_pieces + padding_pieces)

    # Assert that `segment_ids` has the right content.
    question_segment = [0] * expected_question_ids_length
    passages_segment = [1] * (content_len - expected_question_ids_length)
    padding_segment = [0] * pad_len
    self.assertEqual(
        feature_vals('segment_ids'),
        question_segment + passages_segment + padding_segment)

    # Assert that `input_mask` has the right content.
    content_mask = [1] * content_len
    padding_mask = [0] * pad_len
    self.assertEqual(feature_vals('input_mask'), content_mask + padding_mask)

    # Some features are only present for training data, and some only for
    # non-training data.
    if creator_fn.is_training:
      # Fail with a clear message (rather than a TypeError further down) when
      # a caller forgets one of the arguments required in training mode.
      self.assertIsNotNone(
          expected_answer_type,
          'expected_answer_type is required when is_training is True.')
      self.assertIsNotNone(
          expected_answer_span_start,
          'expected_answer_span_start is required when is_training is True.')
      self.assertIsNotNone(
          expected_answer_span_end,
          'expected_answer_span_end is required when is_training is True.')
      self.assertIsNotNone(
          expected_answer_span_pieces,
          'expected_answer_span_pieces is required when is_training is True.')

      self.assertEqual(
          feature_vals('answer_types'), [int(expected_answer_type)])

      # Assert that the answer wordpiece span offsets are as expected.
      self.assertEqual(
          feature_vals('start_positions'), [expected_answer_span_start])
      self.assertEqual(
          feature_vals('end_positions'), [expected_answer_span_end])
      # Assert that `result`'s `input_ids` contain the expected span of pieces
      # at that location.
      # Note that the end position is INCLUSIVE.
      self.assertEqual(
          input_pieces[expected_answer_span_start:expected_answer_span_end + 1],
          expected_answer_span_pieces)

    else:
      wp_start_offset = feature_vals('wp_start_offset')
      wp_end_offset = feature_vals('wp_end_offset')

      # Assert that the piece-to-plaintext-byte mapping covers exactly the size
      # of `result`'s `input_ids`.
      self.assertEqual(len(wp_start_offset), len(wp_end_offset))
      self.assertEqual(len(wp_start_offset), len(input_pieces))

      # Check each piece-to-plaintext-byte entry.
      for i, (piece, start_byte, end_byte) in enumerate(
          zip(input_pieces, wp_start_offset, wp_end_offset)):
        if i < expected_question_ids_length:
          # Asserts that pieces that are part of the question portion of the
          # input are never mapped to positions in the plaintext.
          self.assertEqual(start_byte, -1)
          self.assertEqual(end_byte, -1)

        elif i < content_len:
          # For the pieces that make up the text passages:
          # Either both offsets must be present, or both must be absent.
          self.assertEqual(start_byte != -1, end_byte != -1)
          if start_byte == -1:
            debug_msg = (f'i={i}, piece={piece}, '
                         f'start_byte={start_byte}, end_byte={end_byte}')
            # Since this piece is not mapped to bytes in the plaintext,
            # assert that it is an expected special symbol (or potentially a
            # space preceding/following one of those symbols).
            self.assertRegex(piece, unmapped_piece_regex, debug_msg)
            if piece == ' ':
              # An unmapped space is only legitimate right next to a passage
              # marker. The index guards keep the neighbour lookups inside the
              # list instead of wrapping around or raising IndexError.
              prev_is_marker = (
                  i > 0 and
                  re.match(marker_regex, input_pieces[i - 1]) is not None)
              next_is_marker = (
                  i + 1 < len(input_pieces) and
                  re.match(marker_regex, input_pieces[i + 1]) is not None)
              self.assertTrue(prev_is_marker or next_is_marker, debug_msg)
          else:
            # Since this piece is mapped to a byte span in the plaintext,
            # assert that the plaintext span matches the corresponding piece.
            # Note that the end offset is INCLUSIVE.
            plaintext_span = (
                entry['plaintext'].encode()[start_byte:end_byte + 1].decode())
            clean_piece = piece
            if clean_piece.startswith('##'):
              # Remove the special wordpiece marker.
              clean_piece = clean_piece[2:]
            elif clean_piece.startswith('▁'):
              # Remove the special sentencepiece marker.
              clean_piece = clean_piece[1:]
            self.assertEqual(plaintext_span, clean_piece)

        else:
          # Asserts that padding pieces are never mapped to positions in the
          # plaintext.
          self.assertEqual(start_byte, -1)
          self.assertEqual(end_byte, -1)

  def test_create_tfexample_fn_min_answer_wide_train(self):
    """Training mode, minimal answer, window wide enough for the whole entry.

    With `max_seq_length=1024` the question and every passage of
    `_ENTRY_MIN_ANSWER` are expected to land in a single example, so exactly
    one result is asserted and it must contain the minimal answer span.
    """
    creator_fn = tf_io.CreateTFExampleFn(
        is_training=True,
        max_question_length=64,
        max_seq_length=1024,
        doc_stride=128,
        include_unknowns=-1.0,
        tokenizer=make_tokenizer())
    errors = []
    results: List[tf.train.Example] = list(
        creator_fn.process(_ENTRY_MIN_ANSWER, errors=errors))
    self.assertEmpty(errors)

    self.assertLen(results, 1)
    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_MIN_ANSWER,
        creator_fn=creator_fn,
        result=results[0],
        result_index=0,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=1024,
        # 45 = '[CLS]' + '[Q]' + the 42 question characters + '[SEP]'.
        expected_question_ids_length=45,
        # Golden pieces: the question segment, then the marked-up passages one
        # character per piece, closed by a final '[SEP]'.
        expected_content_pieces=[
            '[CLS]', '[Q]', 'W', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ',
            'a', ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h',
            "'", 's', ' ', 's', 't', 'r', 'i', 'p', 'e', 's', ' ', 'l', 'o',
            'c', 'a', 't', 'e', 'd', '?', '[SEP]', '\ue006', ' ', 'T', 'h', 'e',
            ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h', ' ',
            'i', 's', ' ', 't', 'h', 'e', ' ', 'm', 'o', 's', 't', ' ', 'c',
            'o', 'm', 'm', 'o', 'n', ' ', 'e', 's', 't', 'r', 'i', 'l', 'd',
            'i', 'd', ' ', 'f', 'i', 'n', 'c', 'h', '.', ' ', 'T', 'h', 'e',
            ' ', 'b', 'i', 'r', 'd', ' ', 'h', 'a', 's', ' ', 'b', 'e', 'e',
            'n', ' ', 'i', 'n', 't', 'r', 'o', 'd', 'u', 'c', 'e', 'd', ' ',
            't', 'o', ' ', 'P', 'u', 'e', 'r', 't', 'o', ' ', 'R', 'i', 'c',
            'o', '.', ' ', '\ue007', ' ', 'T', 'h', 'e', ' ', 'b', 'o', 'd',
            'y', ' ', 't', 'e', 'm', 'p', 'e', 'r', 'a', 't', 'u', 'r', 'e',
            ' ', '(', 'a', 's', ' ', 'm', 'e', 'a', 's', 'u', 'r', 'e', 'd',
            ' ', 'f', 'r', 'o', 'm', ' ', 't', 'h', 'e', ' ', 'c', 'l', 'o',
            'a', 'c', 'a', ')', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'z',
            'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h', ' ', 'm', 'a',
            'y', ' ', 'v', 'a', 'r', 'y', ' ', 'f', 'r', 'o', 'm', ' ', '3',
            '8', ' ', 't', 'o', ' ', '4', '4', ' ', '°', 'C', '.', ' ',
            '\ue008', ' ', 'T', 'h', 'e', ' ', 'z', 'e', 'b', 'r', 'a', ' ',
            'f', 'i', 'n', 'c', 'h', ' ', 'w', 'a', 's', ' ', 'f', 'i', 'r',
            's', 't', ' ', 'c', 'o', 'l', 'l', 'e', 'c', 't', 'e', 'd', ' ',
            'i', 'n', ' ', '1', '8', '0', '1', ' ', 'd', 'u', 'r', 'i', 'n',
            'g', ' ', 'N', 'i', 'c', 'o', 'l', 'a', 's', ' ', 'B', 'a', 'u',
            'd', 'i', 'n', "'", 's', ' ', 'e', 'x', 'p', 'e', 'd', 'i', 't',
            'i', 'o', 'n', ' ', 't', 'o', ' ', 'A', 'u', 's', 't', 'r', 'a',
            'l', 'i', 'a', '.', ' ', 'I', 't', ' ', 'w', 'a', 's', ' ', 'd',
            'e', 's', 'c', 'r', 'i', 'b', 'e', 'd', ' ', 'i', 'n', ' ', '1',
            '8', '1', '7', ' ', 'b', 'y', ' ', 'L', 'o', 'u', 'i', 's', ' ',
            'J', 'e', 'a', 'n', ' ', 'P', 'i', 'e', 'r', 'r', 'e', ' ', 'V',
            'i', 'e', 'i', 'l', 'l', 'o', 't', ' ', 'i', 'n', ' ', 'h', 'i',
            's', ' ', 'N', 'o', 'u', 'v', 'e', 'a', 'u', ' ', 'D', 'i', 'c',
            't', 'i', 'o', 'n', 'n', 'a', 'i', 'r', 'e', ' ', 'd', "'", 'H',
            'i', 's', 't', 'o', 'i', 'r', 'e', ' ', 'N', 'a', 't', 'u', 'r',
            'e', 'l', 'l', 'e', '.', ' ', '\ue009', ' ', 'M', 'o', 'r', 'p',
            'h', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'd', 'i', 'f',
            'f', 'e', 'r', 'e', 'n', 'c', 'e', 's', ' ', 'b', 'e', 't', 'w',
            'e', 'e', 'n', ' ', 't', 'h', 'e', ' ', 's', 'u', 'b', 's', 'p',
            'e', 'c', 'i', 'e', 's', '.', ' ', 'M', 'a', 'l', 'e', 's', ' ',
            'd', 'o', ' ', 'n', 'o', 't', ' ', 'h', 'a', 'v', 'e', ' ', 't',
            'h', 'e', ' ', 'f', 'i', 'n', 'e', ' ', 'b', 'a', 'r', 'r', 'i',
            'n', 'g', ' ', 'f', 'o', 'u', 'n', 'd', ' ', 'o', 'n', ' ', 't',
            'h', 'e', ' ', 't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n', 'd',
            ' ', 'u', 'p', 'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's', 't',
            '.', ' ', '\ue010', ' ', 'S', 'y', 'm', 'm', 'e', 't', 'r', 'y',
            ' ', 'o', 'f', ' ', 'b', 'o', 't', 'h', ' ', 'p', 'l', 'u', 'm',
            'a', 'g', 'e', ',', ' ', 'l', 'i', 'k', 'e', ' ', 'c', 'h', 'e',
            's', 't', ' ', 'b', 'a', 'n', 'd', 's', ',', ' ', 'a', 'n', 'd',
            ' ', 'a', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l', ' ', 'f',
            'e', 'a', 't', 'u', 'r', 'e', 's', ',', ' ', 'l', 'i', 'k', 'e',
            ' ', 'l', 'e', 'g', ' ', 'b', 'a', 'n', 'd', 's', ',', ' ', 'a',
            'r', 'e', ' ', 'p', 'r', 'e', 'f', 'e', 'r', 'r', 'e', 'd', ' ',
            'b', 'y', ' ', 't', 'h', 'e', ' ', 'f', 'e', 'm', 'a', 'l', 'e',
            '.', ' ', '\ue011', ' ', 'N', 'e', 's', 't', ' ', 'p', 'r', 'e',
            'd', 'a', 't', 'o', 'r', 's', ' ', 'o', 'f', ' ', 't', 'h', 'e',
            ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h', ' ',
            'i', 'n', 'c', 'l', 'u', 'd', 'e', ' ', 't', 'h', 'e', ' ', 't',
            'i', 'g', 'e', 'r', ' ', 's', 'n', 'a', 'k', 'e', '.', '[SEP]'
        ],
        expected_answer_type=data.AnswerType.MINIMAL,
        # Inclusive piece indices into `input_ids`; 565 - 543 + 1 = 23 pieces,
        # one per character of 'throat and upper breast'.
        expected_answer_span_start=543,
        expected_answer_span_end=565,
        expected_answer_span_pieces=[
            't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n', 'd', ' ', 'u', 'p',
            'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's', 't'
        ])

  def test_create_tfexample_fn_min_answer_wide_predict(self):
    """Prediction mode, window wide enough for the whole entry.

    Uses the same entry and window settings as the wide training test but with
    `is_training=False`. No answer expectations are passed, so the shared
    assertion helper checks the `wp_start_offset` / `wp_end_offset` features
    instead of the answer features.
    """
    creator_fn = tf_io.CreateTFExampleFn(
        is_training=False,
        max_question_length=64,
        max_seq_length=1024,
        doc_stride=128,
        include_unknowns=-1.0,
        tokenizer=make_tokenizer())
    errors = []
    results: List[tf.train.Example] = list(
        creator_fn.process(_ENTRY_MIN_ANSWER, errors=errors))
    self.assertEmpty(errors)

    self.assertLen(results, 1)
    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_MIN_ANSWER,
        creator_fn=creator_fn,
        result=results[0],
        result_index=0,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=1024,
        # 45 = '[CLS]' + '[Q]' + the 42 question characters + '[SEP]'.
        expected_question_ids_length=45,
        # Golden pieces: the question segment, then the marked-up passages one
        # character per piece, closed by a final '[SEP]'.
        expected_content_pieces=[
            '[CLS]', '[Q]', 'W', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ',
            'a', ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h',
            "'", 's', ' ', 's', 't', 'r', 'i', 'p', 'e', 's', ' ', 'l', 'o',
            'c', 'a', 't', 'e', 'd', '?', '[SEP]', '\ue006', ' ', 'T', 'h', 'e',
            ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h', ' ',
            'i', 's', ' ', 't', 'h', 'e', ' ', 'm', 'o', 's', 't', ' ', 'c',
            'o', 'm', 'm', 'o', 'n', ' ', 'e', 's', 't', 'r', 'i', 'l', 'd',
            'i', 'd', ' ', 'f', 'i', 'n', 'c', 'h', '.', ' ', 'T', 'h', 'e',
            ' ', 'b', 'i', 'r', 'd', ' ', 'h', 'a', 's', ' ', 'b', 'e', 'e',
            'n', ' ', 'i', 'n', 't', 'r', 'o', 'd', 'u', 'c', 'e', 'd', ' ',
            't', 'o', ' ', 'P', 'u', 'e', 'r', 't', 'o', ' ', 'R', 'i', 'c',
            'o', '.', ' ', '\ue007', ' ', 'T', 'h', 'e', ' ', 'b', 'o', 'd',
            'y', ' ', 't', 'e', 'm', 'p', 'e', 'r', 'a', 't', 'u', 'r', 'e',
            ' ', '(', 'a', 's', ' ', 'm', 'e', 'a', 's', 'u', 'r', 'e', 'd',
            ' ', 'f', 'r', 'o', 'm', ' ', 't', 'h', 'e', ' ', 'c', 'l', 'o',
            'a', 'c', 'a', ')', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'z',
            'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h', ' ', 'm', 'a',
            'y', ' ', 'v', 'a', 'r', 'y', ' ', 'f', 'r', 'o', 'm', ' ', '3',
            '8', ' ', 't', 'o', ' ', '4', '4', ' ', '°', 'C', '.', ' ',
            '\ue008', ' ', 'T', 'h', 'e', ' ', 'z', 'e', 'b', 'r', 'a', ' ',
            'f', 'i', 'n', 'c', 'h', ' ', 'w', 'a', 's', ' ', 'f', 'i', 'r',
            's', 't', ' ', 'c', 'o', 'l', 'l', 'e', 'c', 't', 'e', 'd', ' ',
            'i', 'n', ' ', '1', '8', '0', '1', ' ', 'd', 'u', 'r', 'i', 'n',
            'g', ' ', 'N', 'i', 'c', 'o', 'l', 'a', 's', ' ', 'B', 'a', 'u',
            'd', 'i', 'n', "'", 's', ' ', 'e', 'x', 'p', 'e', 'd', 'i', 't',
            'i', 'o', 'n', ' ', 't', 'o', ' ', 'A', 'u', 's', 't', 'r', 'a',
            'l', 'i', 'a', '.', ' ', 'I', 't', ' ', 'w', 'a', 's', ' ', 'd',
            'e', 's', 'c', 'r', 'i', 'b', 'e', 'd', ' ', 'i', 'n', ' ', '1',
            '8', '1', '7', ' ', 'b', 'y', ' ', 'L', 'o', 'u', 'i', 's', ' ',
            'J', 'e', 'a', 'n', ' ', 'P', 'i', 'e', 'r', 'r', 'e', ' ', 'V',
            'i', 'e', 'i', 'l', 'l', 'o', 't', ' ', 'i', 'n', ' ', 'h', 'i',
            's', ' ', 'N', 'o', 'u', 'v', 'e', 'a', 'u', ' ', 'D', 'i', 'c',
            't', 'i', 'o', 'n', 'n', 'a', 'i', 'r', 'e', ' ', 'd', "'", 'H',
            'i', 's', 't', 'o', 'i', 'r', 'e', ' ', 'N', 'a', 't', 'u', 'r',
            'e', 'l', 'l', 'e', '.', ' ', '\ue009', ' ', 'M', 'o', 'r', 'p',
            'h', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'd', 'i', 'f',
            'f', 'e', 'r', 'e', 'n', 'c', 'e', 's', ' ', 'b', 'e', 't', 'w',
            'e', 'e', 'n', ' ', 't', 'h', 'e', ' ', 's', 'u', 'b', 's', 'p',
            'e', 'c', 'i', 'e', 's', '.', ' ', 'M', 'a', 'l', 'e', 's', ' ',
            'd', 'o', ' ', 'n', 'o', 't', ' ', 'h', 'a', 'v', 'e', ' ', 't',
            'h', 'e', ' ', 'f', 'i', 'n', 'e', ' ', 'b', 'a', 'r', 'r', 'i',
            'n', 'g', ' ', 'f', 'o', 'u', 'n', 'd', ' ', 'o', 'n', ' ', 't',
            'h', 'e', ' ', 't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n', 'd',
            ' ', 'u', 'p', 'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's', 't',
            '.', ' ', '\ue010', ' ', 'S', 'y', 'm', 'm', 'e', 't', 'r', 'y',
            ' ', 'o', 'f', ' ', 'b', 'o', 't', 'h', ' ', 'p', 'l', 'u', 'm',
            'a', 'g', 'e', ',', ' ', 'l', 'i', 'k', 'e', ' ', 'c', 'h', 'e',
            's', 't', ' ', 'b', 'a', 'n', 'd', 's', ',', ' ', 'a', 'n', 'd',
            ' ', 'a', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l', ' ', 'f',
            'e', 'a', 't', 'u', 'r', 'e', 's', ',', ' ', 'l', 'i', 'k', 'e',
            ' ', 'l', 'e', 'g', ' ', 'b', 'a', 'n', 'd', 's', ',', ' ', 'a',
            'r', 'e', ' ', 'p', 'r', 'e', 'f', 'e', 'r', 'r', 'e', 'd', ' ',
            'b', 'y', ' ', 't', 'h', 'e', ' ', 'f', 'e', 'm', 'a', 'l', 'e',
            '.', ' ', '\ue011', ' ', 'N', 'e', 's', 't', ' ', 'p', 'r', 'e',
            'd', 'a', 't', 'o', 'r', 's', ' ', 'o', 'f', ' ', 't', 'h', 'e',
            ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h', ' ',
            'i', 'n', 'c', 'l', 'u', 'd', 'e', ' ', 't', 'h', 'e', ' ', 't',
            'i', 'g', 'e', 'r', ' ', 's', 'n', 'a', 'k', 'e', '.', '[SEP]'
        ])

  def test_create_tfexample_fn_min_answer_narrow_train(self):
    creator_fn = tf_io.CreateTFExampleFn(
        is_training=True,
        max_question_length=64,
        max_seq_length=256,
        doc_stride=64,
        include_unknowns=-1.0,
        tokenizer=make_tokenizer())
    entry = _ENTRY_MIN_ANSWER
    errors = []
    results: List[tf.train.Example] = list(
        creator_fn.process(entry, errors=errors))
    self.assertEmpty(errors)

    self.assertLen(results, 3)
    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_MIN_ANSWER,
        creator_fn=creator_fn,
        result=results[0],
        result_index=5,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=256,
        expected_question_ids_length=45,
        expected_content_pieces=[
            '[CLS]', '[Q]', 'W', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ',
            'a', ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h',
            "'", 's', ' ', 's', 't', 'r', 'i', 'p', 'e', 's', ' ', 'l', 'o',
            'c', 'a', 't', 'e', 'd', '?', '[SEP]', ' ', 'L', 'o', 'u', 'i', 's',
            ' ', 'J', 'e', 'a', 'n', ' ', 'P', 'i', 'e', 'r', 'r', 'e', ' ',
            'V', 'i', 'e', 'i', 'l', 'l', 'o', 't', ' ', 'i', 'n', ' ', 'h',
            'i', 's', ' ', 'N', 'o', 'u', 'v', 'e', 'a', 'u', ' ', 'D', 'i',
            'c', 't', 'i', 'o', 'n', 'n', 'a', 'i', 'r', 'e', ' ', 'd', "'",
            'H', 'i', 's', 't', 'o', 'i', 'r', 'e', ' ', 'N', 'a', 't', 'u',
            'r', 'e', 'l', 'l', 'e', '.', ' ', '\ue009', ' ', 'M', 'o', 'r',
            'p', 'h', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'd', 'i',
            'f', 'f', 'e', 'r', 'e', 'n', 'c', 'e', 's', ' ', 'b', 'e', 't',
            'w', 'e', 'e', 'n', ' ', 't', 'h', 'e', ' ', 's', 'u', 'b', 's',
            'p', 'e', 'c', 'i', 'e', 's', '.', ' ', 'M', 'a', 'l', 'e', 's',
            ' ', 'd', 'o', ' ', 'n', 'o', 't', ' ', 'h', 'a', 'v', 'e', ' ',
            't', 'h', 'e', ' ', 'f', 'i', 'n', 'e', ' ', 'b', 'a', 'r', 'r',
            'i', 'n', 'g', ' ', 'f', 'o', 'u', 'n', 'd', ' ', 'o', 'n', ' ',
            't', 'h', 'e', ' ', 't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n',
            'd', ' ', 'u', 'p', 'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's',
            't', '.', ' ', '\ue010', ' ', 'S', 'y', 'm', 'm', 'e', '[SEP]'
        ],
        expected_answer_type=data.AnswerType.MINIMAL,
        expected_answer_span_start=223,
        expected_answer_span_end=245,
        expected_answer_span_pieces=[
            't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n', 'd', ' ', 'u', 'p',
            'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's', 't'
        ])
    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_MIN_ANSWER,
        creator_fn=creator_fn,
        result=results[1],
        result_index=6,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=256,
        expected_question_ids_length=45,
        expected_content_pieces=[
            '[CLS]', '[Q]', 'W', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ',
            'a', ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h',
            "'", 's', ' ', 's', 't', 'r', 'i', 'p', 'e', 's', ' ', 'l', 'o',
            'c', 'a', 't', 'e', 'd', '?', '[SEP]', 'r', 'e', ' ', 'N', 'a', 't',
            'u', 'r', 'e', 'l', 'l', 'e', '.', ' ', '\ue009', ' ', 'M', 'o',
            'r', 'p', 'h', 'o', 'l', 'o', 'g', 'i', 'c', 'a', 'l', ' ', 'd',
            'i', 'f', 'f', 'e', 'r', 'e', 'n', 'c', 'e', 's', ' ', 'b', 'e',
            't', 'w', 'e', 'e', 'n', ' ', 't', 'h', 'e', ' ', 's', 'u', 'b',
            's', 'p', 'e', 'c', 'i', 'e', 's', '.', ' ', 'M', 'a', 'l', 'e',
            's', ' ', 'd', 'o', ' ', 'n', 'o', 't', ' ', 'h', 'a', 'v', 'e',
            ' ', 't', 'h', 'e', ' ', 'f', 'i', 'n', 'e', ' ', 'b', 'a', 'r',
            'r', 'i', 'n', 'g', ' ', 'f', 'o', 'u', 'n', 'd', ' ', 'o', 'n',
            ' ', 't', 'h', 'e', ' ', 't', 'h', 'r', 'o', 'a', 't', ' ', 'a',
            'n', 'd', ' ', 'u', 'p', 'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a',
            's', 't', '.', ' ', '\ue010', ' ', 'S', 'y', 'm', 'm', 'e', 't',
            'r', 'y', ' ', 'o', 'f', ' ', 'b', 'o', 't', 'h', ' ', 'p', 'l',
            'u', 'm', 'a', 'g', 'e', ',', ' ', 'l', 'i', 'k', 'e', ' ', 'c',
            'h', 'e', 's', 't', ' ', 'b', 'a', 'n', 'd', 's', ',', ' ', 'a',
            'n', 'd', ' ', 'a', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l',
            ' ', 'f', 'e', 'a', 't', 'u', 'r', 'e', 's', ',', ' ', '[SEP]'
        ],
        expected_answer_type=data.AnswerType.MINIMAL,
        expected_answer_span_start=159,
        expected_answer_span_end=181,
        expected_answer_span_pieces=[
            't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n', 'd', ' ', 'u', 'p',
            'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's', 't'
        ])
    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_MIN_ANSWER,
        creator_fn=creator_fn,
        result=results[2],
        result_index=7,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=256,
        expected_question_ids_length=45,
        expected_content_pieces=[
            '[CLS]', '[Q]', 'W', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ',
            'a', ' ', 'z', 'e', 'b', 'r', 'a', ' ', 'f', 'i', 'n', 'c', 'h',
            "'", 's', ' ', 's', 't', 'r', 'i', 'p', 'e', 's', ' ', 'l', 'o',
            'c', 'a', 't', 'e', 'd', '?', '[SEP]', '.', ' ', 'M', 'a', 'l', 'e',
            's', ' ', 'd', 'o', ' ', 'n', 'o', 't', ' ', 'h', 'a', 'v', 'e',
            ' ', 't', 'h', 'e', ' ', 'f', 'i', 'n', 'e', ' ', 'b', 'a', 'r',
            'r', 'i', 'n', 'g', ' ', 'f', 'o', 'u', 'n', 'd', ' ', 'o', 'n',
            ' ', 't', 'h', 'e', ' ', 't', 'h', 'r', 'o', 'a', 't', ' ', 'a',
            'n', 'd', ' ', 'u', 'p', 'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a',
            's', 't', '.', ' ', '\ue010', ' ', 'S', 'y', 'm', 'm', 'e', 't',
            'r', 'y', ' ', 'o', 'f', ' ', 'b', 'o', 't', 'h', ' ', 'p', 'l',
            'u', 'm', 'a', 'g', 'e', ',', ' ', 'l', 'i', 'k', 'e', ' ', 'c',
            'h', 'e', 's', 't', ' ', 'b', 'a', 'n', 'd', 's', ',', ' ', 'a',
            'n', 'd', ' ', 'a', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l',
            ' ', 'f', 'e', 'a', 't', 'u', 'r', 'e', 's', ',', ' ', 'l', 'i',
            'k', 'e', ' ', 'l', 'e', 'g', ' ', 'b', 'a', 'n', 'd', 's', ',',
            ' ', 'a', 'r', 'e', ' ', 'p', 'r', 'e', 'f', 'e', 'r', 'r', 'e',
            'd', ' ', 'b', 'y', ' ', 't', 'h', 'e', ' ', 'f', 'e', 'm', 'a',
            'l', 'e', '.', ' ', '\ue011', ' ', 'N', 'e', 's', 't', ' ', 'p',
            'r', 'e', 'd', 'a', 't', 'o', 'r', 's', ' ', 'o', 'f', '[SEP]'
        ],
        expected_answer_type=data.AnswerType.MINIMAL,
        expected_answer_span_start=95,
        expected_answer_span_end=117,
        expected_answer_span_pieces=[
            't', 'h', 'r', 'o', 'a', 't', ' ', 'a', 'n', 'd', ' ', 'u', 'p',
            'p', 'e', 'r', ' ', 'b', 'r', 'e', 'a', 's', 't'
        ])

  def test_create_tfexample_fn_min_answer_narrow_predict(self):
    """Checks the first and last windows emitted in narrow predict mode.

    Runs `_ENTRY_MIN_ANSWER` through a non-training `CreateTFExampleFn` with a
    256-position sequence and a stride of 64, expects nine examples with no
    reported errors, and compares the content pieces of results 0 and 8.
    """
    example_creator = tf_io.CreateTFExampleFn(
        is_training=False,
        max_question_length=64,
        max_seq_length=256,
        doc_stride=64,
        include_unknowns=-1.0,
        tokenizer=make_tokenizer())
    problems = []
    examples: List[tf.train.Example] = list(
        example_creator.process(_ENTRY_MIN_ANSWER, errors=problems))
    self.assertEmpty(problems)
    self.assertLen(examples, 9)

    # The expected pieces are single characters apart from the bracketed
    # markers, so the text is spelled out as strings and exploded with `*`.
    question_pieces = [
        '[CLS]', '[Q]', *"Where are a zebra finch's stripes located?", '[SEP]'
    ]
    first_context = (
        '\ue006 The zebra finch is the most common estrildid finch. '
        'The bird has been introduced to Puerto Rico. '
        '\ue007 The body temperature (as measured from the cloaca) of the '
        'zebra finch may vary from 38 to 44 °C. '
        '\ue008 The zebra ')
    last_context = (
        'er breast. '
        '\ue010 Symmetry of both plumage, like chest bands, and artificial '
        'features, like leg bands, are preferred by the female. '
        '\ue011 Nest predators of the zebra finch include the tiger snake.')

    # Keyword arguments that are identical for both checks below.
    shared_expectations = dict(
        entry=_ENTRY_MIN_ANSWER,
        creator_fn=example_creator,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=256,
        expected_question_ids_length=45)

    self.assertCreateTfexampleFnResult(
        result=examples[0],
        result_index=0,
        expected_content_pieces=[*question_pieces, *first_context, '[SEP]'],
        **shared_expectations)
    # ...Skipping assertions for results 1-7 for brevity...
    self.assertCreateTfexampleFnResult(
        result=examples[8],
        result_index=8,
        expected_content_pieces=[*question_pieces, *last_context, '[SEP]'],
        **shared_expectations)

  def test_create_tfexample_fn_passage_answer_wide_train(self):
    """Checks the single training example built for `_ENTRY_PASSAGE_ANSWER`.

    Uses a 512-position sequence with a stride of 128 in training mode,
    expects exactly one example with no reported errors, and verifies its
    content pieces, the PASSAGE answer type and the answer span (start 94,
    end 140) together with the pieces inside that span.
    """
    example_creator = tf_io.CreateTFExampleFn(
        is_training=True,
        max_question_length=64,
        max_seq_length=512,
        doc_stride=128,
        include_unknowns=-1.0,
        tokenizer=make_tokenizer())
    problems = []
    examples: List[tf.train.Example] = list(
        example_creator.process(_ENTRY_PASSAGE_ANSWER, errors=problems))
    self.assertEmpty(problems)

    _print_debug_info(examples, example_creator)

    self.assertLen(examples, 1)

    # The expected pieces are single characters apart from the bracketed
    # markers, so the text is spelled out as strings and exploded per character.
    question_text = 'Something without a minimal answer?'
    context_text = (
        '\ue006 The zebra finch is the most common estrildid finch. '
        '\ue007 The body temperature may vary from 38 to 44 °C. '
        '\ue008 Nest predators include the tiger snake.')
    answer_text = 'The body temperature may vary from 38 to 44 °C.'

    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_PASSAGE_ANSWER,
        creator_fn=example_creator,
        result=examples[0],
        result_index=0,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=512,
        expected_question_ids_length=38,
        expected_content_pieces=[
            '[CLS]', '[Q]', *question_text, '[SEP]', *context_text, '[SEP]'
        ],
        expected_answer_type=data.AnswerType.PASSAGE,
        expected_answer_span_start=94,
        expected_answer_span_end=140,
        expected_answer_span_pieces=list(answer_text))

  def test_create_tfexample_fn_passage_answer_wide_predict(self):
    """Checks the single predict-mode example for `_ENTRY_PASSAGE_ANSWER`.

    Uses a 512-position sequence with a stride of 128 and `is_training=False`,
    expects exactly one example with no reported errors, and verifies its
    content pieces. No answer expectations are passed in this mode.
    """
    example_creator = tf_io.CreateTFExampleFn(
        is_training=False,
        max_question_length=64,
        max_seq_length=512,
        doc_stride=128,
        include_unknowns=-1.0,
        tokenizer=make_tokenizer())
    problems = []
    examples: List[tf.train.Example] = list(
        example_creator.process(_ENTRY_PASSAGE_ANSWER, errors=problems))
    self.assertEmpty(problems)

    self.assertLen(examples, 1)

    # The expected pieces are single characters apart from the bracketed
    # markers, so the text is spelled out as strings and exploded per character.
    question_text = 'Something without a minimal answer?'
    context_text = (
        '\ue006 The zebra finch is the most common estrildid finch. '
        '\ue007 The body temperature may vary from 38 to 44 °C. '
        '\ue008 Nest predators include the tiger snake.')

    self.assertCreateTfexampleFnResult(
        entry=_ENTRY_PASSAGE_ANSWER,
        creator_fn=example_creator,
        result=examples[0],
        result_index=0,
        expected_language=data.Language.ENGLISH,
        expected_input_ids_length=512,
        expected_question_ids_length=38,
        expected_content_pieces=[
            '[CLS]', '[Q]', *question_text, '[SEP]', *context_text, '[SEP]'
        ])


# Script entry point: when this file is executed directly, hand control to
# TensorFlow's test runner.
if __name__ == '__main__':
  tf.test.main()
